Comparison of isotype and V/D/J gene usage. Here put all these analyses in one place and in R, just for consistency with the other figures of this analysis.

Uses the updated data file. ANOVA was performed on GraphPad. Here read in those results to add annotation of statistical significance to the plots.

# anova test results
anova_results <- list.files("test.csv", path = "GraphPad results for Joseph/", full.names = TRUE)
anova_results <- anova_results[!grepl("^Age", basename(anova_results))]
anova_results <- lapply(anova_results, function(x){
  tb <- read.csv(x, stringsAsFactors = FALSE)
  tb[which(tb$pval == "<0.0001"), "pval"] <- 0
  tb[which(tb$pval == ">0.9999"), "pval"] <- 1
  tb$pval <- as.numeric(tb$pval)
  tb$subset <- unlist(strsplit(basename(x), split = "_"))[1]
  tb
})
vdj_test <- do.call("rbind", anova_results[1:4])
isotype_test <- anova_results[[5]]

vdj_test$SampleType <- replace(vdj_test$SampleType, which(vdj_test$SampleType == "COVID-19 Recovered"),
                               "COVID-19Recovered")
isotype_test$SampleType <- replace(isotype_test$SampleType, which(isotype_test$SampleType == "COVID-19 Recovered"),
                                   "COVID-19Recovered")
vdj_test$subset <- factor(vdj_test$subset,
                          levels = c("Bulk", "IgMmem", "IgA1", "IgG1"),
                          labels = c("all sequences", "IgM mutated", "IgA1", "IgG1"))
# This function gets the % of repertoire 
getPercentagesFromCountTbList <- function(tb_list, category_name = "Vgene")
{
  lapply(tb_list, function(tb){
    variables <- colnames(tb)[-ncol(tb)]
    variables <- variables[variables != category_name]
    if(length(variables) == 0){
      tb$all <- sum(tb$V1)
      tb$perc <- tb$V1 / tb$all
      tb <- tb[, c(category_name, "perc")]
    } else if(length(variables) > 0){
      sums <- ddply(tb, variables, summarise, all = sum(V1), .drop = FALSE)
      colnames(sums) <- c(variables, "all")
      tb <- merge(tb, sums, by = variables, sort = FALSE)
      tb$perc <- tb$V1 / tb$all
      tb <- tb[, c(category_name, variables, "perc")]
    }
    tb
  })
}

Isotype

isotype_usage <- list( 
  "IgG" = ddply(test_data[which(grepl("IgG[1-4]$", test_data$Subclass)),], 
                c("SampleType2", "PatientID", "Subclass"), nrow, .drop = FALSE) ,
  "IgA" = ddply(test_data[which(grepl("IgA[1-2]$", test_data$Subclass)),], 
                c("SampleType2", "PatientID", "Subclass"), nrow, .drop = FALSE) 
)
isotype_usage <- getPercentagesFromCountTbList(isotype_usage, category_name = "Subclass")
isotype_usage <- do.call("rbind", isotype_usage)
isotype_usage <- ddply(isotype_usage, c("SampleType2", "Subclass"), summarise, 
                       mean_perc = mean(perc, na.rm = TRUE),
                       sem = sd(perc, na.rm = TRUE) / (length(unique(PatientID)))^0.5)
isotype_usage <- merge(isotype_usage, isotype_test, by.x = c("SampleType2", "Subclass"),
                       by.y = c("SampleType", "Subclass"), all.x = TRUE, all.y = TRUE, sort = FALSE)
isotype_usage[which(is.na(isotype_usage$pval)), "pval"] <- 0
isotype_usage$SampleType2 <- factor(isotype_usage$SampleType2,
                                    levels = c("Healthy", "COVID-19", "COVID-19Recovered", "Ebola",
                                               "RSV-I", "RSV-U", "YFVD28"),
                                    labels = c("Healthy", "CV19", "CV19-Recovered", "EBOV",
                                               "RSV-I", "RSV-U", "YFVD28"))
saveRDS(isotype_usage, 'gene_usage_C.rds')

# IgG
g_igg <- ggplot(isotype_usage[which(grepl("^IgG", isotype_usage$Subclass)), ],
       aes(x = Subclass, y = mean_perc, ymin = mean_perc - sem, ymax = mean_perc +sem, fill = SampleType2)) +
  geom_bar(stat = "identity", colour = "black", position = position_dodge(width = .9)) + 
  scale_fill_brewer(type = "qual", name ="") + xlab("") +
  scale_y_continuous(labels = scales::percent, name = "Proportion of IgG repertoire", limits = c(0, 0.85)) +
  geom_errorbar(position = position_dodge(0.9), width = 0.2) +
  cowplot::theme_cowplot() + theme(legend.position = "none")

# IgA
g_iga <- ggplot(isotype_usage[which(grepl("^IgA", isotype_usage$Subclass)), ],
       aes(x = Subclass, y = mean_perc, ymin = mean_perc - sem, ymax = mean_perc +sem, fill = SampleType2)) +
  geom_bar(stat = "identity", colour = "black", position = position_dodge(width = .9)) + 
  scale_fill_brewer(type = "qual", name ="") + xlab("") +
  scale_y_continuous(labels = scales::percent, name = "Proportion of IgA repertoire", limits = c(0, 0.85)) +
  geom_errorbar(position = position_dodge(0.9), width = 0.2) +
  cowplot::theme_cowplot()

cowplot::plot_grid(
  g_iga, g_igg, nrow = 1, rel_widths = c(1, 1), align = "h", axis = "tb"
)

V gene

test_data$Vgene <- factor(test_data$Vgene)
v_usage <- list(
  "all" = ddply(test_data, c("Vgene", "SampleType2", "PatientID"), nrow, .drop = FALSE),
  "isotype" = ddply(test_data, c("Subclass", "Vgene", "SampleType2", "PatientID"), nrow, .drop = FALSE),
  "IgM mutated" = ddply(test_data[which(test_data$Class == "M" &
                                         test_data$MutationalStatus == "Mutated"), ],
                       c("Vgene", "SampleType2", "PatientID"), nrow, .drop = FALSE)
)
v_usage <- getPercentagesFromCountTbList( v_usage, category_name = "Vgene" )

# label the observations
v_usage[["all"]]$Subclass <- "all sequences"
#v_usage[["isotype"]]$Subclass <- paste("Ig", v_usage$isotype$Class, sep = "")
v_usage[["IgM mutated"]]$Subclass <- "IgM mutated"

# combine the tables
v_usage <- do.call("rbind", lapply(v_usage, function(tb){
  tb[, c("SampleType2", "Vgene", "Subclass", "PatientID" ,"perc")]
}))

v_usage0.5 <- ddply(v_usage, c("SampleType2", "Vgene", "Subclass"), summarise, 
                    mean_perc = mean(perc, na.rm = TRUE))
v_usage0.5 <- merge(v_usage0.5, v_usage0.5[which(v_usage0.5$SampleType2 == "Healthy"), -1],
                    by = c("Vgene", "Subclass"))
v_usage0.5$mean_perc <- v_usage0.5$mean_perc.x - v_usage0.5$mean_perc.y
# Select only the labels you want
v_usage1 <- v_usage0.5[v_usage0.5$Subclass %in% c("all sequences", "IgA1", "IgG1", "IgM mutated"), ]

# this below controls the order these plots are shown
v_usage1$Subclass <- factor(v_usage1$Subclass, 
                            levels = c("all sequences", "IgM mutated", "IgA1", "IgG1"))
#Select disease states
v_usage1 <- v_usage1[v_usage1$SampleType2 %in% c("COVID-19", "COVID-19Recovered", "Ebola", "Healthy", "RSV-I", "RSV-U", "YFVD28"), ]

# you can filter certain v genes you want if necessary, e.g.:
v_usage1 <- v_usage1[ !grepl("^IGK|^IGL", v_usage1$Vgene), ]
v_usage1 <- na.omit(v_usage1)

# filter for all anova significant ones first!!
v_usage1.1 <- v_usage1#[ v_usage1$Vgene %in% c("IGHJ3", "IGHJ4", "IGHJ5", "IGHJ6") , ]
v_usage1.1 <- v_usage1.1[which(v_usage1.1$Vgene %in% vdj_test$gene), ]

# BUlk sequences- order genes by clustering; here on only the 'all sequences' panel:
clustering <- hclust(dist(
  reshape2::acast( v_usage1.1[ which(v_usage1.1$Subclass == "all sequences" & v_usage1.1$SampleType2 != "Healthy"), ],
                   Vgene ~ SampleType2, value.var = "mean_perc")
))
Vgene_order <- rev(clustering$labels[clustering$order])
v_usage1.1$Vgene <- factor(v_usage1.1$Vgene, levels = Vgene_order)
# v_usage1.1 <- v_usage1.1[!is.na(v_usage1.1$Vgene), ]

# add back ANOVA p-values
v_usage1.1 <- merge(v_usage1.1, vdj_test, by.x = c("SampleType2", "Vgene", "Subclass"),
                    by.y = c("SampleType", "gene", "subset"), all.x = TRUE, all.y = FALSE, sort = FALSE)
v_usage1.1$sig <- (v_usage1.1$pval < 0.05)
v_usage1.1$logp <- -log(v_usage1.1$pval)
v_usage1.1$logp[which(is.infinite(v_usage1.1$logp))] <- 10
v_usage1.1$pval[which(is.na(v_usage1.1$pval))] <- 0
v_usage1.1$logp[which(v_usage1.1$SampleType2 == "Healthy")] <- -log(0.5)
# this below controls the order of x-axis
v_usage1.1$SampleType2 <- factor(v_usage1.1$SampleType2, 
                               levels = c("Healthy", "COVID-19", "COVID-19Recovered", "Ebola", "RSV-I", "RSV-U", "YFVD28"),
                               labels = c("Healthy", "CV19", "CV19-Recovered", "EBOV",
                                          "RSV-I", "RSV-U", "YFVD28"))

saveRDS(v_usage1.1, 'gene_usage_V.rds')
# plot
g <- do.call("c", lapply(c("all sequences", "IgM mutated", "IgA1", "IgG1"), function(x){
  if(x == "IgA1") legendpos = "bottom" else legendpos = "none"
  list(
    ggplot(v_usage1.1[v_usage1.1$Subclass == x & v_usage1.1$SampleType2 == "Healthy", ], 
           aes(x = mean_perc.x, y = Vgene)) + geom_bar(stat = "identity") +
      cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1),
                                       axis.text.y = element_blank()) +
      scale_x_reverse(name = "% sequences\nin Healthy", labels = scales::percent) + 
      scale_y_discrete(position = "right", name = "") + ggtitle(x),
    ggplot(v_usage1.1[v_usage1.1$Subclass == x & v_usage1.1$SampleType2 != "Healthy", ], 
           aes(x = SampleType2, y = Vgene, fill = mean_perc, size = sig)) + 
      geom_point(pch = 21) + 
      scale_fill_gradient2(low = "blue", high = "red", midpoint = 0, mid = "white",  
                           name = "% sequences\n(difference from\nHealthy)", labels = scales::percent) +
      scale_size_manual(values = c("TRUE" = 7, "FALSE" = 2),
                        breaks = c("TRUE", "FALSE"), name = "p < 0.05") +
      #scale_size_continuous(range = c(1, 7), limits = c(0, 10),
      #                      breaks = c(-log(1), -log(0.5), -log(0.25), -log(0.1), -log(0.05), -log(0.01)),
      #                      labels = c(1, 0.5, 0.25, 0.1, 0.05, 0.01), name = "p-value") +
      xlab("") + ylab("") + 
      cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1),
                                       legend.position = legendpos)
  )
}))
cowplot::plot_grid(
  plotlist = g, nrow = 1, axis = "tb", align = "h", rel_widths = rep(c(2,3),4)
)
## Warning: Removed 48 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 42 rows containing missing values (geom_point).

Plot the dendrogram:

plot(as.dendrogram(clustering))

#pheatmap::pheatmap(
#  mat = reshape2::acast( v_usage1.1[ which(v_usage1.1$Subclass == "all sequences" & v_usage1.1$SampleType2 != "Healthy"), ],
#                   Vgene ~ SampleType2, value.var = "mean_perc"), cluster_rows = clustering 
  #clustering_distance_rows = "euclidean", clustering_method = "complete"
#)
# selected genes
v_selected <- v_usage1.1[v_usage1.1$Vgene %in% c("IGHV1-69", "IGHV3-7", "IGHV3-23", "IGHV3-30",
                                                 "IGHV4-34", "IGHV4-39"), ]
v_selected$Vgene <- as.character(v_selected$Vgene)
g <- do.call("c", lapply(c("all sequences", "IgM mutated", "IgA1", "IgG1"), function(x){
  if(x == "IgA1") legendpos = "bottom" else legendpos = "none"
  tb <- v_selected[v_selected$Subclass == x & v_selected$SampleType2 != "Healthy", ]
  tb[which(tb$mean_perc < -0.02), "mean_perc"] <- -0.02
  tb[which(tb$mean_perc > 0.04), "mean_perc"] <- 0.04
  list(
    ggplot(v_selected[v_selected$Subclass == x & v_selected$SampleType2 == "Healthy", ], 
           aes(x = mean_perc.x, y = Vgene)) + geom_bar(stat = "identity") +
      cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1),
                                       axis.text.y = element_blank()) +
      scale_x_reverse(name = "% sequences\nin Healthy", labels = scales::percent) + 
      scale_y_discrete(position = "right", name = "") + ggtitle(x),
    ggplot(tb, 
           aes(x = SampleType2, y = Vgene, fill = mean_perc, size =sig)) + 
      geom_point(pch = 21) + 
      scale_fill_gradient2(low = "blue", high = "red", midpoint = 0, mid = "white",  
                           name = "% sequences\n(difference from\nHealthy)", 
                           limits = c(-0.02, 0.04), labels = scales::percent) +
      scale_size_manual(values = c("TRUE" = 7, "FALSE" = 2),
                        breaks = c("TRUE", "FALSE"), name = "p < 0.05") +
      #scale_size_continuous(range = c(1, 7), limits = c(0, 10),
      #                      breaks = c(-log(1), -log(0.5), -log(0.25), -log(0.1), -log(0.05), -log(0.01)),
      #                      labels = c(1, 0.5, 0.25, 0.1, 0.05, 0.01), name = "p-value") +
      xlab("") + ylab("") + 
      cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1),
                                       legend.position = legendpos)
  )
}))
g_v <- cowplot::plot_grid(
  plotlist = g, nrow = 1, axis = "tb", align = "h", rel_widths = rep(c(2,3),4)
)
g_v

D gene

test_data$Dgene <- factor(test_data$Dgene)
d_usage <- list(
  "all" = ddply(test_data, c("Dgene", "SampleType2", "PatientID"), nrow, .drop = FALSE),
  "isotype" = ddply(test_data, c("Subclass", "Dgene", "SampleType2", "PatientID"), nrow, .drop = FALSE),
  "IgM mutated" = ddply(test_data[which(test_data$Class == "M" &
                                         test_data$MutationalStatus == "Mutated"), ],
                       c("Dgene", "SampleType2", "PatientID"), nrow, .drop = FALSE)
)
d_usage <- getPercentagesFromCountTbList( d_usage, category_name = "Dgene" )

# label the observations
d_usage[["all"]]$Subclass <- "all sequences"
#d_usage[["isotype"]]$Subclass <- paste("Ig", d_usage$isotype$Class, sep = "")
d_usage[["IgM mutated"]]$Subclass <- "IgM mutated"

# combine the tables
d_usage <- do.call("rbind", lapply(d_usage, function(tb){
  tb[, c("SampleType2", "Dgene", "Subclass", "PatientID" ,"perc")]
}))

d_usage0.5 <- ddply(d_usage, c("SampleType2", "Dgene", "Subclass"), summarise, 
                    mean_perc = mean(perc, na.rm = TRUE))
d_usage0.5 <- merge(d_usage0.5, d_usage0.5[which(d_usage0.5$SampleType2 == "Healthy"), -1],
                    by = c("Dgene", "Subclass"))
d_usage0.5$mean_perc <- d_usage0.5$mean_perc.x - d_usage0.5$mean_perc.y

# Select only the labels you want
d_usage1 <- d_usage0.5[d_usage0.5$Subclass %in% c("all sequences", "IgA1", "IgG1", "IgM mutated"), ]

# this below controls the order these plots are shown
d_usage1$Subclass <- factor(d_usage1$Subclass, 
                            levels = c("all sequences", "IgM mutated", "IgA1", "IgG1"))
#Select disease states
d_usage1 <- d_usage1[d_usage1$SampleType2 %in% c("COVID-19", "COVID-19Recovered", "Ebola", "Healthy", "RSV-I", "RSV-U", "YFVD28"), ]

# you can filter certain v genes you want if necessary, e.g.:
d_usage1 <- d_usage1[ !grepl("^IGK|^IGL", d_usage1$Dgene), ]
d_usage1 <- na.omit(d_usage1)

# filter for all anova significant ones first!!
d_usage1.1 <- d_usage1#[ d_usage1$Dgene %in% c("IGHJ3", "IGHJ4", "IGHJ5", "IGHJ6") , ]
d_usage1.1 <- d_usage1.1[which(d_usage1.1$Dgene %in% vdj_test$gene), ]

# BUlk sequences- order genes by clustering; here on only the 'all sequences' panel:
clustering <- hclust(dist(
  reshape2::acast( d_usage1.1[ which(d_usage1.1$Subclass == "all sequences" & d_usage1.1$SampleType2 != "Healthy"), ],
                   Dgene ~ SampleType2, value.var = "mean_perc")
))
Dgene_order <- rev(clustering$labels[clustering$order])
d_usage1.1$Dgene <- factor(d_usage1.1$Dgene, levels = Dgene_order)
d_usage1.1 <- d_usage1.1[!is.na(d_usage1.1$Dgene), ]

# add back ANOVA p-values
d_usage1.1 <- merge(d_usage1.1, vdj_test, by.x = c("SampleType2", "Dgene", "Subclass"),
                    by.y = c("SampleType", "gene", "subset"), all.x = TRUE, all.y = FALSE, sort = FALSE)
d_usage1.1$sig <- (d_usage1.1$pval < 0.05)
d_usage1.1$logp <- -log(d_usage1.1$pval)
d_usage1.1$logp[which(is.infinite(d_usage1.1$logp))] <- 10
d_usage1.1$pval[which(is.na(d_usage1.1$pval))] <- 0
d_usage1.1$logp[which(d_usage1.1$SampleType2 == "Healthy")] <- -log(0.5)
# this below controls the order of x-axis
d_usage1.1$SampleType2 <- factor(d_usage1.1$SampleType2, 
                               levels = c("Healthy", "COVID-19", "COVID-19Recovered", "Ebola", "RSV-I", "RSV-U", "YFVD28"),
                               labels = c("Healthy", "CV19", "CV19-Recovered", "EBOV",
                                          "RSV-I", "RSV-U", "YFVD28"))

saveRDS(d_usage1.1, 'gene_usage_D.rds')

# plot
g <- do.call("c", lapply(c("all sequences", "IgM mutated", "IgA1", "IgG1"), function(x){
  if(x == "IgA1") legendpos = "bottom" else legendpos = "none"
  list(
    ggplot(d_usage1.1[d_usage1.1$Subclass == x & d_usage1.1$SampleType2 == "Healthy", ], 
           aes(x = mean_perc.x, y = Dgene)) + geom_bar(stat = "identity") +
      cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1),
                                       axis.text.y = element_blank()) +
      scale_x_reverse(name = "% sequences\nin Healthy", labels = scales::percent) + 
      scale_y_discrete(position = "right", name = "") + ggtitle(x),
    ggplot(d_usage1.1[d_usage1.1$Subclass == x & d_usage1.1$SampleType2 != "Healthy", ], 
           aes(x = SampleType2, y = Dgene, fill = mean_perc, size =sig)) + 
      geom_point(pch = 21) + 
      scale_fill_gradient2(low = "blue", high = "red", midpoint = 0, mid = "white",  
                           name = "% sequences\n(difference from\nHealthy)", labels = scales::percent) +
      scale_size_manual(values = c("TRUE" = 7, "FALSE" = 2),
                        breaks = c("TRUE", "FALSE"), name = "p < 0.05") +
      #scale_size_continuous(range = c(1, 7), limits = c(0, 10),
      #                      breaks = c(-log(1), -log(0.5), -log(0.25), -log(0.1), -log(0.05), -log(0.01)),
      #                      labels = c(1, 0.5, 0.25, 0.1, 0.05, 0.01), name = "p-value") +
      xlab("") + ylab("") + 
      cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1),
                                       legend.position = legendpos)
  )
}))
cowplot::plot_grid(
  plotlist = g, nrow = 1, axis = "tb", align = "h", rel_widths = rep(c(2,3),4)
)

plot the dendrogram:

plot(as.dendrogram(clustering))

#pheatmap::pheatmap(
#  mat = reshape2::acast( d_usage1.1[ which(d_usage1.1$Subclass == "all sequences" & d_usage1.1$SampleType2 != "Healthy"), ],
#                   Dgene ~ SampleType2, value.var = "mean_perc"), cluster_rows = clustering
  #clustering_distance_rows = "euclidean", clustering_method = "complete"
#)
# selected genes
#d_selected <- d_usage1.1[d_usage1.1$Dgene %in% c("IGHD3-10", "IGHD3-22", "IGHD3-9", "IGHD1-26", "IGHD2-15", 
#                                          "IGHD2-2", "IGHD3-3", "IGHD6-19"), ]
d_selected <- d_usage1.1[d_usage1.1$Dgene %in% c("IGHD2-2", "IGHD3-3"), ]
d_selected$Dgene <- as.character(d_selected$Dgene)

g <- do.call("c", lapply(c("all sequences", "IgM mutated", "IgA1", "IgG1"), function(x){
  if(x == "IgA1") legendpos = "bottom" else legendpos = "none"
  tb <- d_selected[d_selected$Subclass == x & d_selected$SampleType2 != "Healthy", ]
  tb[which(tb$mean_perc < -0.02), "mean_perc"] <- -0.02
  tb[which(tb$mean_perc > 0.04), "mean_perc"] <- 0.04
  list(
    ggplot(d_selected[d_selected$Subclass == x & d_selected$SampleType2 == "Healthy", ], 
           aes(x = mean_perc.x, y = Dgene)) + geom_bar(stat = "identity") +
      cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1),
                                       axis.text.y = element_blank()) +
      scale_x_reverse(name = "% sequences\nin Healthy", labels = scales::percent) + 
      scale_y_discrete(position = "right", name = "") + ggtitle(x),
    ggplot(tb, 
           aes(x = SampleType2, y = Dgene, fill = mean_perc, size =sig)) + 
      geom_point(pch = 21) + 
      scale_fill_gradient2(low = "blue", high = "red", midpoint = 0, mid = "white",  
                           name = "% sequences\n(difference from\nHealthy)", 
                           limits = c(-0.02, 0.04), labels = scales::percent) +
      scale_size_manual(values = c("TRUE" = 7, "FALSE" = 2),
                        breaks = c("TRUE", "FALSE"), name = "p < 0.05") +
      #scale_size_continuous(range = c(1, 7), limits = c(0, 10),
      #                      breaks = c(-log(1), -log(0.5), -log(0.25), -log(0.1), -log(0.05), -log(0.01)),
      #                      labels = c(1, 0.5, 0.25, 0.1, 0.05, 0.01), name = "p-value") +
      xlab("") + ylab("") + 
      cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1),
                                       legend.position = legendpos)
  )
}))
g_d <- cowplot::plot_grid(
  plotlist = g, nrow = 1, axis = "tb", align = "h", rel_widths = rep(c(2,3),4)
)
g_d

J gene

test_data$Jfamily <- factor(test_data$Jfamily)
j_usage <- list(
  "all" = ddply(test_data, c("Jfamily", "SampleType2", "PatientID"), nrow, .drop = FALSE),
  "isotype" = ddply(test_data, c("Subclass", "Jfamily", "SampleType2", "PatientID"), nrow, .drop = FALSE),
  "IgM mutated" = ddply(test_data[which(test_data$Class == "M" &
                                         test_data$MutationalStatus == "Mutated"), ],
                       c("Jfamily", "SampleType2", "PatientID"), nrow, .drop = FALSE)
)
j_usage <- getPercentagesFromCountTbList( j_usage, category_name = "Jfamily" )

# label the observations
j_usage[["all"]]$Subclass <- "all sequences"
#j_usage[["isotype"]]$Subclass <- paste("Ig", j_usage$isotype$Class, sep = "")
j_usage[["IgM mutated"]]$Subclass <- "IgM mutated"

# combine the tables
j_usage <- do.call("rbind", lapply(j_usage, function(tb){
  tb[, c("SampleType2", "Jfamily", "Subclass", "PatientID" ,"perc")]
}))

j_usage0.5 <- ddply(j_usage, c("SampleType2", "Jfamily", "Subclass"), summarise, 
                    mean_perc = mean(perc, na.rm = TRUE))
j_usage0.5 <- merge(j_usage0.5, j_usage0.5[which(j_usage0.5$SampleType2 == "Healthy"), -1],
                    by = c("Jfamily", "Subclass"))
j_usage0.5$mean_perc <- j_usage0.5$mean_perc.x - j_usage0.5$mean_perc.y

# Select only the labels you want
j_usage1 <- j_usage0.5[j_usage0.5$Subclass %in% c("all sequences", "IgA1", "IgG1", "IgM mutated"), ]

# this below controls the order these plots are shown
j_usage1$Subclass <- factor(j_usage1$Subclass, 
                            levels = c("all sequences", "IgM mutated", "IgA1", "IgG1"))
#Select disease states
j_usage1 <- j_usage1[j_usage1$SampleType2 %in% c("COVID-19", "COVID-19Recovered", "Ebola", "Healthy", "RSV-I", "RSV-U", "YFVD28"), ]

# you can filter certain v genes you want if necessary, e.g.:
j_usage1 <- j_usage1[ !grepl("^IGK|^IGL", j_usage1$Jfamily), ]
j_usage1 <- na.omit(j_usage1)

# filter for all anova significant ones first!!
j_usage1.1 <- j_usage1#[ j_usage1$Jfamily %in% c("IGHJ3", "IGHJ4", "IGHJ5", "IGHJ6") , ]


# BUlk sequences- order genes by clustering; here on only the 'all sequences' panel:
clustering <- hclust(dist(
  reshape2::acast( j_usage1.1[ which(j_usage1.1$Subclass == "all sequences"), ],
                   Jfamily ~ SampleType2, value.var = "mean_perc")
))
Jfamily_order <- clustering$labels[clustering$order]
# j_usage1.1$Jfamily <- factor(j_usage1.1$Jfamily, levels = Jfamily_order)

# add back ANOVA p-values
j_usage1.1 <- merge(j_usage1.1, vdj_test, by.x = c("SampleType2", "Jfamily", "Subclass"),
                    by.y = c("SampleType", "gene", "subset"), all.x = TRUE, all.y = FALSE, sort = FALSE)
j_usage1.1$sig <- (j_usage1.1$pval < 0.05)
j_usage1.1$logp <- -log(j_usage1.1$pval)
j_usage1.1$logp[which(is.infinite(j_usage1.1$logp))] <- 10
j_usage1.1$pval[which(is.na(j_usage1.1$pval))] <- 0
j_usage1.1$logp[which(j_usage1.1$SampleType2 == "Healthy")] <- -log(0.5)
# this below controls the order of x-axis
j_usage1.1$SampleType2 <- factor(j_usage1.1$SampleType2, 
                               levels = c("Healthy", "COVID-19", "COVID-19Recovered", "Ebola", "RSV-I", "RSV-U", "YFVD28"),
                               labels = c("Healthy", "CV19", "CV19-Recovered", "EBOV",
                                          "RSV-I", "RSV-U", "YFVD28"))

saveRDS(j_usage1.1, 'gene_usage_J.rds')

# plot
g <- do.call("c", lapply(c("all sequences", "IgM mutated", "IgA1", "IgG1"), function(x){
  if(x == "IgA1") legendpos = "bottom" else legendpos = "none"
  list(
    ggplot(j_usage1.1[j_usage1.1$Subclass == x & j_usage1.1$SampleType2 == "Healthy", ], 
           aes(x = mean_perc.x, y = Jfamily)) + geom_bar(stat = "identity") +
      cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1),
                                       axis.text.y = element_blank()) +
      scale_x_reverse(name = "% sequences\nin Healthy", labels = scales::percent) + 
      scale_y_discrete(position = "right", name = "") + ggtitle(x),
    ggplot(j_usage1.1[j_usage1.1$Subclass == x & j_usage1.1$SampleType2 != "Healthy", ], 
           aes(x = SampleType2, y = Jfamily, fill = mean_perc, size =sig)) + 
      geom_point(pch = 21) + 
      scale_fill_gradient2(low = "blue", high = "red", midpoint = 0, mid = "white",  
                           name = "% sequences\n(difference from\nHealthy)", labels = scales::percent) +
      scale_size_manual(values = c("TRUE" = 7, "FALSE" = 2),
                        breaks = c("TRUE", "FALSE"), name = "p < 0.05") +
      #scale_size_continuous(range = c(1, 7), limits = c(0, 10),
      #                      breaks = c(-log(1), -log(0.5), -log(0.25), -log(0.1), -log(0.05), -log(0.01)),
      #                      labels = c(1, 0.5, 0.25, 0.1, 0.05, 0.01), name = "p-value") +
      xlab("") + ylab("") + 
      cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1),
                                       legend.position = legendpos)
  )
}))
cowplot::plot_grid(
  plotlist = g, nrow = 1, axis = "tb", align = "h", rel_widths = rep(c(2,3),4)
)

# J4 and J6 only
g <- do.call("c", lapply(c("all sequences", "IgM mutated", "IgA1", "IgG1"), function(x){
  if(x == "IgA1") legendpos = "bottom" else legendpos = "none"
  tb <- j_usage1.1[j_usage1.1$Subclass == x & j_usage1.1$SampleType2 != "Healthy" &
                        j_usage1.1$Jfamily %in% c("IGHJ4", "IGHJ6"), ]
  tb[which(tb$mean_perc < -0.02), "mean_perc"] <- -0.02
  tb[which(tb$mean_perc > 0.04), "mean_perc"] <- 0.04
  list(
    ggplot(j_usage1.1[j_usage1.1$Subclass == x & j_usage1.1$SampleType2 == "Healthy" &
                        j_usage1.1$Jfamily %in% c("IGHJ4", "IGHJ6"), ], 
           aes(x = mean_perc.x, y = Jfamily)) + geom_bar(stat = "identity") +
      cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1),
                                       axis.text.y = element_blank()) +
      scale_x_reverse(name = "% sequences\nin Healthy", labels = scales::percent) + 
      scale_y_discrete(position = "right", name = "") + ggtitle(x),
    ggplot(tb, 
           aes(x = SampleType2, y = Jfamily, fill = mean_perc, size =sig)) + 
      geom_point(pch = 21) + 
      scale_fill_gradient2(low = "blue", high = "red", midpoint = 0, mid = "white",  
                           name = "% sequences\n(difference from\nHealthy)",
                           limits = c(-0.02, 0.04), labels = scales::percent) +
      scale_size_manual(values = c("TRUE" = 7, "FALSE" = 2),
                        breaks = c("TRUE", "FALSE"), name = "p < 0.05") +
      #scale_size_continuous(range = c(1, 7), limits = c(0, 10),
      #                      breaks = c(-log(1), -log(0.5), -log(0.25), -log(0.1), -log(0.05), -log(0.01)),
      #                      labels = c(1, 0.5, 0.25, 0.1, 0.05, 0.01), name = "p-value") +
      xlab("") + ylab("") + 
      cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1),
                                       legend.position = legendpos)
  )
}))
g_j <- cowplot::plot_grid(
  plotlist = g, nrow = 1, axis = "tb", align = "h", rel_widths = rep(c(2,3),4)
)

selected genes

cowplot::plot_grid(
  g_v, g_d, g_j, ncol = 1, axis = "lr", align = "v", rel_heights = c(2.5, 2, 2)
)